1 Setup

suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(plotly))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))
suppressPackageStartupMessages(library(here))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(gplots))
source(here("UPSCb-common/src/R/plot.multidensity.R"))
source(here("UPSCb-common/src/R/featureSelection.R"))

Function for selecting significant genes (FDR < 0.01, |log2FC| => 0.5)

sigDeg <- function(res, p = 0.01, log2fc = 0.5, genes = "all") {
  if(genes == "all") return(res[res$padj<p & !is.na(res$padj) & abs(res$log2FoldChange) >= log2fc,])
  if(genes == "up") return(res[res$padj<p & !is.na(res$padj) & res$log2FoldChange >= log2fc,])
  if(genes == "down") return(res[res$padj<p & !is.na(res$padj) & res$log2FoldChange <= -log2fc,])
}
pal <- brewer.pal(12,"Paired")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_csv("~/Git/zygoticEmbryogenesis/doc/ZE_Dataset_v4.csv",
                    col_types = cols(col_character(),
                                     col_character(),
                                     col_factor(),
                                     col_character(),
                                     col_factor(),
                                     col_character(),
                                     col_double(),
                                     col_double())) %>% 
  mutate(Tissue=factor(Tissue)) %>% 
  mutate(Time=factor(Time))

# correct typo
samples$User.ID[samples$NGI.ID == "P11562_201"] <- sub("_S", "", samples$User.ID[samples$NGI.ID == "P11562_201"])

sample information including technical replicates (sequencing lanes)

samples_techRep <- samples[rep(seq_len(nrow(samples)), each = 2), ]
samples_techRep$Lane <- rep(c("L001","L002"), 57)
samples_techRep$TechRep <- paste(samples_techRep$NGI.ID, samples_techRep$Lane, sep = "_")

2 Prepare data

2.1 Raw data

filelist <- list.files(here("data/RNA-Seq/salmon"), 
                          recursive = TRUE,
                          pattern = "quant.sf",
                          full.names = TRUE)

Filter files from folder tmp

filelist <- str_subset(filelist, "tmp", negate = TRUE)

names(filelist) <- str_extract(filelist, "P11562_.+_L00[1,2]")
names(filelist) <- sub("_S[0-9]+", "", names(filelist))

setdiff(names(filelist), samples_techRep$TechRep)
## character(0)
setdiff(samples_techRep$TechRep, names(filelist))
## [1] "P11562_204_L002"
# One technical replicate is missing for the sample 204 (738-FMG).
samples_techRep <- filter(samples_techRep, !grepl("P11562_204_L002",TechRep))

stopifnot(all(str_which(names(filelist), samples_techRep$TechRep) == 1:length(filelist)))

# Exclude sample representing time point B10 as there is only one replicate
samples_techRep <- filter(samples_techRep, !grepl("B10",Time))
filelist <- filelist[names(filelist) %in% samples_techRep$TechRep]
samples <- filter(samples, !grepl("B10", Time))

Read the expression at the gene level (there is one transcript per gene)

geneExpr <- suppressMessages(tximport(files = filelist, 
                                  type = "salmon",txOut=TRUE))
counts_techRep <- round(geneExpr$counts)

# BQA -----------------------------------------------------------------  

2.2 Check technical replicates

2.2.1 Prepare DESeq2 object

dds_techRep <- DESeqDataSetFromMatrix(
countData = counts_techRep,
colData = samples_techRep,
design =  ~Replicate)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
# As using design ~Tissue*Time gives an error, I will use ~Replicate, as I am interested in blind = TRUE evaluation. 
# Later, after joining technical replicates and before doing DE I will use design model ~Tissue*Time  

# Error in checkFullRank(modelMatrix) : 
#   the model matrix is not full rank, so the model cannot be fit as specified.
# Levels or combinations of levels without any samples have resulted in
# column(s) of zeros in the model matrix.
# 
# Please read the vignette section 'Model matrix not full rank':
#   
#   vignette('DESeq2')  

2.2.2 PCA

Transformation of the data for visualisation purposes

vsd_t <- varianceStabilizingTransformation(dds_techRep, blind = TRUE)
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
vst_t <- assay(vsd_t)
vst_t <- vst_t - min(vst_t)

meanSdPlot(vst_t[rowSums(counts_techRep)>0,])

PCA

pc <- prcomp(t(vst_t))
percent <- round(summary(pc)$importance[2,]*100)

plot(cumsum(percent))

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    as.data.frame(colData(vsd_t)))

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue,text=TechRep)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))

2.2.3 Outlier

Sample B4 is an outlier, it is in the group of samples that represent mature seeds. Exclude it from further analysis.

# remove P11562_112 from count table 
counts_techRep <- counts_techRep[ , str_subset(colnames(counts_techRep), "P11562_112_L00*", negate = TRUE)]
# remove P11562_112 from sample list
samples_techRep <- filter(samples_techRep, !grepl("112",NGI.ID))
samples <- filter(samples, !grepl("112",NGI.ID))

stopifnot(all(str_which(colnames(counts_techRep), samples_techRep$TechRep) == 1:length(colnames(counts_techRep))))

2.2.4 Join technical replicates

counts <- sapply(split.data.frame(t(counts_techRep),samples_techRep$NGI.ID),colSums)
stopifnot(all(str_which(colnames(counts), samples$NGI.ID) == 1:length(colnames(counts))))

save counts and sample information

write.csv(counts, file = here("analysis/salmon/ZE-allStages_unnormalised-counts.csv"), quote = FALSE)
write.csv(samples, file = here("doc/sample_info.csv"), quote = FALSE)
write.csv(samples_techRep, file = here("doc/sample_info_techRep.csv"), quote = FALSE)

2.3 Analysis of biological replicates

2.3.1 Prepare DESeq object (design Replicate)

dds <- DESeqDataSetFromMatrix(countData = counts,
                              colData = samples,
                              design =  ~Replicate)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

2.3.1.1 PCA

Transformation of the data for visualisation purposes

vsd <- varianceStabilizingTransformation(dds, blind = TRUE)
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
vst <- assay(vsd)
vst <- vst - min(vst)

meanSdPlot(vst[rowSums(counts)>0,])

PCA

pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)

plot(cumsum(percent))

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    as.data.frame(colData(vsd)))

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue,text=NGI.ID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
# dds: duplicated S, ~Tissue*Time----------------------------------------------------  

2.3.2 Prepare DESeq object (design ~Tissue*Time)

Duplicate samples from first three stages and rename them from S to FMG and ZE
Change sample information

samples_S <- samples[samples$Tissue == "S", ]
# S to FMG
samples_StoFMG <- samples_S
samples_StoFMG[ , c("Tissue", "Replicate")] <- apply(samples_S[ , c("Tissue", "Replicate")], 2, function(x){gsub("S", "FMG", x)})
samples_StoFMG$User.ID <- paste(samples_StoFMG$User.ID, samples_StoFMG$Tissue, sep = "_")
# S to ZE
samples_StoZE <- samples_S
samples_StoZE[ , c("Tissue", "Replicate")] <- apply(samples_S[ , c("Tissue", "Replicate")], 2, function(x){gsub("S", "ZE", x)})
samples_StoZE$User.ID <- paste(samples_StoZE$User.ID, samples_StoZE$Tissue, sep = "_")
# make up new NGI.ID for additional samples created above (as first samples start with 1 and replaced samples start with 2, start these with 3)
samples_StoZE$NGI.ID <- sub("_1", "_3", samples_StoZE$NGI.ID)
samples_StoZE$NGI.ID <- sub("_201", "_309", samples_StoZE$NGI.ID)
# join all in new sample info
samples_duplS <- rbind(samples_StoFMG, samples_StoZE, samples[samples$Tissue != "S", ]) 

Change count table

# duplicate counts and give them new NGI.IDs
counts_StoZE <- counts[ ,samples$Tissue == "S"]
colnames(counts_StoZE) <- samples_StoZE$NGI.ID 
# join all in new count table
counts_duplS <- cbind(counts, counts_StoZE)

Reorder counts to match sample info

counts_duplS <- counts_duplS[ , match(samples_duplS$NGI.ID, colnames(counts_duplS))]

Save counts and sample info with duplicated “S” samples

write.csv(counts_duplS, file = here("analysis/salmon/ZE-allStages_unnormalised-counts_duplS.csv"), quote = FALSE)
write.csv(samples_duplS, file = here("doc/sample_info_duplS.csv"), quote = FALSE)

Prepare DESeq object

dds <- DESeqDataSetFromMatrix(
  countData = counts_duplS,
  colData = samples_duplS,
  design = ~ Tissue * Time)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## factor levels were dropped which had no samples
# check factor levels
levels(dds$Tissue)
## [1] "FMG" "ZE"
levels(dds$Time)
## [1] "B1" "B2" "B3" "B4" "B5" "B6" "B7" "B8" "B9"

Save dds object

save(dds, file = here("analysis/salmon/ZE-allStages_duplS_dds.rda"))

#’ #’ {r empty,eval=FALSE,echo=FALSE} #' #’ #’ # Session Info #’ {r session info, echo=FALSE} #' sessionInfo() #'